Перед вами датасет с данными о вреде курения
Структура данных:
№1 Восстановить целостностью данных, реализовать функцию позволяющие выполнять заполнение данных по среднему или по медианному значению в Series.
№2 Построить baseline-модель, попытаться выбить лучший скор (сравнить несколько моделей)
№3 Сделать визуализацию для мини-исследования о вреде курения с разбивкой по половой принадлежности пациентов с помощью pandas, matplotlib, seaborn, plotly и других инструментов.
import pandas as pd
import numpy as np
import seaborn as sns
import math
url = 'https://drive.google.com/file/d/1He5GI5_Gd8uXYfeETLBISQ5BszX0o4pU/view?usp=sharing'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
data = pd.read_csv(url)
data = data.drop('Unnamed: 0',axis=1)
data = data.drop(0,axis=0)
data = data.sample(frac=1)
data = data.reset_index(drop=True)
data.head(10)
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 43.0 | 0.0 | 0.0 | Yes | Private | Rural | 58.63 | 28.4 | smokes | 0 |
| 1 | Female | 29.0 | 0.0 | 0.0 | Yes | Private | Rural | 72.52 | 33.9 | never smoked | 0 |
| 2 | Female | 40.0 | 0.0 | NaN | Yes | Govt_job | Rural | 110.6 | 33.3 | formerly smoked | 0 |
| 3 | Female | 32.0 | 0.0 | 0.0 | No | Self-employed | Rural | 93.17 | 27.5 | smokes | 0 |
| 4 | Female | 3.0 | 0.0 | 0.0 | No | children | Rural | 97.6 | 25.8 | Unknown | 0 |
| 5 | Female | 20.0 | 0.0 | 0.0 | No | Private | Rural | 93.74 | 23.7 | Unknown | 0 |
| 6 | Female | 34.0 | 0.0 | 0.0 | Yes | Govt_job | Rural | 70.18 | 24.9 | Unknown | 0 |
| 7 | Female | 51.0 | 1.0 | 0.0 | Yes | Private | Urban | 109.16 | 28.0 | smokes | 0 |
| 8 | Female | 79.0 | 0.0 | 0.0 | Yes | NaN | Urban | 57.77 | NaN | formerly smoked | 0 |
| 9 | Female | 46.0 | 1.0 | 0.0 | Yes | Private | Rural | 81.58 | 36.2 | never smoked | 0 |
def my_fill_na_avg(df, columnName):
columnsType = {'gender':'str', 'age':'int', 'hypertension':'int', 'heart_disease':'int', 'ever_married':'str',
'work_type':'str', 'Residence_type':'str', 'avg_glucose_level':'float', 'bmi':'float',
'smoking_status':'str', 'stroke':'int'}
if not columnName in columnsType:
return -1
else:
col = df[columnName]
sum = 0
countObj = 0
if (columnsType[columnName] == 'int'):
for row in col:
if not pd.isna(row):
sum += int(row.split('.')[0])
countObj += 1
avg = sum / countObj if countObj != 0 else -1
if avg == -1:
return col
count = 0
for row in col:
if pd.isna(row):
col[count] = int(round(avg))
else:
col[count] = int(row.split('.')[0])
count += 1
if (columnsType[columnName] == 'float'):
for row in col:
if not pd.isna(row):
sum += float(row)
countObj += 1
avg = sum / countObj if countObj != 0 else -1
if avg == -1:
return col
count = 0
for row in col:
if pd.isna(row):
col[count] = round(float(avg),2)
else:
col[count] = round(float(row),2)
count += 1
if (columnsType[columnName] == 'str'):
avgStr = dict()
for row in col:
#выберем часто встречающийся объект, т.к. невозможно найти среднее между string
if not pd.isna(row):
count = avgStr[row] if row in avgStr.keys() else 1
avgStr[row] = count+1
max_val = max(avgStr.values())
avg = [k for k, v in avgStr.items() if v == max_val]
count = 0
for row in col:
if pd.isna(row):
col[count] = avg[0]
else:
col[count] = row
count += 1
return col
data_avg = data.copy()
for col in data.columns:
data_avg[col] = my_fill_na_avg(data_avg, col)
data_avg.head(10)
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 43 | 0 | 0 | Yes | Private | Rural | 58.63 | 28.4 | smokes | 0 |
| 1 | Female | 29 | 0 | 0 | Yes | Private | Rural | 72.52 | 33.9 | never smoked | 0 |
| 2 | Female | 40 | 0 | 0 | Yes | Govt_job | Rural | 110.6 | 33.3 | formerly smoked | 0 |
| 3 | Female | 32 | 0 | 0 | No | Self-employed | Rural | 93.17 | 27.5 | smokes | 0 |
| 4 | Female | 3 | 0 | 0 | No | children | Rural | 97.6 | 25.8 | Unknown | 0 |
| 5 | Female | 20 | 0 | 0 | No | Private | Rural | 93.74 | 23.7 | Unknown | 0 |
| 6 | Female | 34 | 0 | 0 | Yes | Govt_job | Rural | 70.18 | 24.9 | Unknown | 0 |
| 7 | Female | 51 | 1 | 0 | Yes | Private | Urban | 109.16 | 28.0 | smokes | 0 |
| 8 | Female | 79 | 0 | 0 | Yes | Private | Urban | 57.77 | 28.94 | formerly smoked | 0 |
| 9 | Female | 46 | 1 | 0 | Yes | Private | Rural | 81.58 | 36.2 | never smoked | 0 |
def my_fill_na_mediana(df, columnName):
columnsType = {'gender':'str', 'age':'int', 'hypertension':'int', 'heart_disease':'int', 'ever_married':'str',
'work_type':'str', 'Residence_type':'str', 'avg_glucose_level':'float', 'bmi':'float',
'smoking_status':'str', 'stroke':'int'}
if not columnName in columnsType:
return -1
else:
col = df[columnName]
median = 0
medList = list()
if (columnsType[columnName] == 'int'):
for row in col:
if not pd.isna(row):
medList.append(int(row.split('.')[0]))
medList.sort()
if len(medList) % 2 == 0:
median = int((medList[int(len(medList)/2-1)]+medList[int(len(medList)/2)])/2)
else:
median = int(medList[math.floor(len(medList)/2)])
count = 0
for row in col:
if pd.isna(row):
col[count] = int(round(median))
else:
col[count] = int(row.split('.')[0])
count += 1
if (columnsType[columnName] == 'float'):
for row in col:
if not pd.isna(row):
medList.append(float(row))
medList.sort()
if len(medList) % 2 == 0:
median = float((medList[int(len(medList)/2-1)]+medList[int(len(medList)/2)])/2)
else:
median = float(medList[math.floor(len(medList)/2)])
count = 0
for row in col:
if pd.isna(row):
col[count] = float(round(median))
else:
col[count] = float(row)
count += 1
if (columnsType[columnName] == 'str'):
for row in col:
if not pd.isna(row):
medList.append(str(row))
medList.sort()
#т.к. невозможно выбрать среднее между двумя string
median = str(medList[math.floor(len(medList)/2)])
count = 0
for row in col:
if pd.isna(row):
col[count] = str(median)
else:
col[count] = str(row)
count += 1
return col
data_mediana = data.copy()
for col in data.columns:
data_mediana[col] = my_fill_na_mediana(data_mediana, col)
data_mediana.head(10)
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 43 | 0 | 0 | Yes | Private | Rural | 58.63 | 28.4 | smokes | 0 |
| 1 | Female | 29 | 0 | 0 | Yes | Private | Rural | 72.52 | 33.9 | never smoked | 0 |
| 2 | Female | 40 | 0 | 0 | Yes | Govt_job | Rural | 110.6 | 33.3 | formerly smoked | 0 |
| 3 | Female | 32 | 0 | 0 | No | Self-employed | Rural | 93.17 | 27.5 | smokes | 0 |
| 4 | Female | 3 | 0 | 0 | No | children | Rural | 97.6 | 25.8 | Unknown | 0 |
| 5 | Female | 20 | 0 | 0 | No | Private | Rural | 93.74 | 23.7 | Unknown | 0 |
| 6 | Female | 34 | 0 | 0 | Yes | Govt_job | Rural | 70.18 | 24.9 | Unknown | 0 |
| 7 | Female | 51 | 1 | 0 | Yes | Private | Urban | 109.16 | 28.0 | smokes | 0 |
| 8 | Female | 79 | 0 | 0 | Yes | Private | Urban | 57.77 | 28.0 | formerly smoked | 0 |
| 9 | Female | 46 | 1 | 0 | Yes | Private | Rural | 81.58 | 36.2 | never smoked | 0 |
data_avg.drop_duplicates(ignore_index=True)
data_mediana.drop_duplicates(ignore_index=True)
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 43 | 0 | 0 | Yes | Private | Rural | 58.63 | 28.4 | smokes | 0 |
| 1 | Female | 29 | 0 | 0 | Yes | Private | Rural | 72.52 | 33.9 | never smoked | 0 |
| 2 | Female | 40 | 0 | 0 | Yes | Govt_job | Rural | 110.6 | 33.3 | formerly smoked | 0 |
| 3 | Female | 32 | 0 | 0 | No | Self-employed | Rural | 93.17 | 27.5 | smokes | 0 |
| 4 | Female | 3 | 0 | 0 | No | children | Rural | 97.6 | 25.8 | Unknown | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5101 | Male | 1 | 0 | 0 | No | children | Rural | 78.53 | 19.8 | Unknown | 0 |
| 5102 | Male | 65 | 0 | 0 | Yes | Private | Rural | 236.14 | 28.0 | Unknown | 0 |
| 5103 | Female | 50 | 0 | 0 | Yes | Self-employed | Urban | 110.18 | 26.0 | formerly smoked | 0 |
| 5104 | Male | 81 | 0 | 1 | Yes | Self-employed | Rural | 68.27 | 25.0 | Unknown | 0 |
| 5105 | Male | 65 | 1 | 0 | Yes | Self-employed | Urban | 113.86 | 36.4 | never smoked | 0 |
5106 rows × 11 columns
columnsType = {'gender':'str', 'age':'int', 'hypertension':'int', 'heart_disease':'int', 'ever_married':'str',
'work_type':'str', 'Residence_type':'str', 'avg_glucose_level':'float', 'bmi':'float',
'smoking_status':'str', 'stroke':'int'}
for col in data.columns:
type = columnsType[col]
data_avg[col] = data_avg[col].astype(str).astype(type)
data_mediana[col] = data_mediana[col].astype(str).astype(type)
print(data_avg.info())
print(data_mediana.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 5110 non-null object 1 age 5110 non-null int32 2 hypertension 5110 non-null int32 3 heart_disease 5110 non-null int32 4 ever_married 5110 non-null object 5 work_type 5110 non-null object 6 Residence_type 5110 non-null object 7 avg_glucose_level 5110 non-null float64 8 bmi 5110 non-null float64 9 smoking_status 5110 non-null object 10 stroke 5110 non-null int32 dtypes: float64(2), int32(4), object(5) memory usage: 359.4+ KB None <class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 5110 non-null object 1 age 5110 non-null int32 2 hypertension 5110 non-null int32 3 heart_disease 5110 non-null int32 4 ever_married 5110 non-null object 5 work_type 5110 non-null object 6 Residence_type 5110 non-null object 7 avg_glucose_level 5110 non-null float64 8 bmi 5110 non-null float64 9 smoking_status 5110 non-null object 10 stroke 5110 non-null int32 dtypes: float64(2), int32(4), object(5) memory usage: 359.4+ KB None
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 10))
plt.plot(data_avg['gender'])
plt.show()
#Удалим выброс
data_avg = data_avg.loc[data_avg['gender'] != 'Other']
data_avg = data_avg.reset_index(drop=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
features = data_avg.drop("stroke", axis=1)
target = data_avg["stroke"]
bad_series = features[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']]
features_oe = features.copy()
encoder = OrdinalEncoder()
features_oe[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']] = pd.DataFrame(encoder.fit_transform(bad_series), columns=bad_series.columns)
features_oe.head()
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 43 | 0 | 0 | 1.0 | 2.0 | 0.0 | 58.63 | 28.4 | 3.0 |
| 1 | 0.0 | 29 | 0 | 0 | 1.0 | 2.0 | 0.0 | 72.52 | 33.9 | 2.0 |
| 2 | 0.0 | 40 | 0 | 0 | 1.0 | 0.0 | 0.0 | 110.60 | 33.3 | 1.0 |
| 3 | 0.0 | 32 | 0 | 0 | 0.0 | 3.0 | 0.0 | 93.17 | 27.5 | 3.0 |
| 4 | 0.0 | 3 | 0 | 0 | 0.0 | 4.0 | 0.0 | 97.60 | 25.8 | 0.0 |
train, test, target_train, target_test = train_test_split(
features_oe, target, test_size=0.25)
rfc = RandomForestClassifier(random_state=123)
model_params = {
"n_estimators" : [10,20,30],
"max_features" : ["auto", "sqrt", "log2"],
"min_samples_split" : [2,4,8],
"bootstrap": [True, False],}
grid_rfc = GridSearchCV(rfc, model_params)
grid_rfc.fit(train, target_train)
print(grid_rfc.best_params_)
pred_log = grid_rfc.predict(test)
f_1_log = f1_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
precision = precision_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
recall = recall_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
accuracy_log = accuracy_score(target_test, pred_log)
print('Значение F1 = {}, \nзначение precision = {}, \nзначение recall = {}, \nзначение accuracy = {}'.format(f_1_log, precision, recall, accuracy_log))
{'bootstrap': True, 'max_features': 'auto', 'min_samples_split': 4, 'n_estimators': 30}
Значение F1 = 0.950876496734705,
значение precision = 0.9457362913866642,
значение recall = 0.9655712050078247,
значение accuracy = 0.9655712050078247
features = data_mediana.drop("stroke", axis=1)
target = data_mediana["stroke"]
bad_series = features[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']]
features_oe = features.copy()
encoder = OrdinalEncoder()
features_oe[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']] = pd.DataFrame(encoder.fit_transform(bad_series), columns=bad_series.columns)
train, test, target_train, target_test = train_test_split(
features_oe, target, test_size=0.25)
rfc = RandomForestClassifier(random_state=123)
model_params = {
"n_estimators" : [10,20,30],
"max_features" : ["auto", "sqrt", "log2"],
"min_samples_split" : [2,4,8],
"bootstrap": [True, False],}
grid_rfc = GridSearchCV(rfc, model_params)
grid_rfc.fit(train, target_train)
print(grid_rfc.best_params_)
pred_log = grid_rfc.predict(test)
f_1_log = f1_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
precision = precision_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
recall = recall_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
accuracy_log = accuracy_score(target_test, pred_log)
print('Значение F1 = {}, \nзначение precision = {}, \nзначение recall = {}, \nзначение accuracy = {}'.format(f_1_log, precision, recall, accuracy_log))
{'bootstrap': True, 'max_features': 'auto', 'min_samples_split': 8, 'n_estimators': 20}
Значение F1 = 0.9320605697102562,
значение precision = 0.9112589046853995,
значение recall = 0.9538341158059468,
значение accuracy = 0.9538341158059468
features = data_avg.drop("stroke", axis=1)
target = data_avg["stroke"]
bad_series = features[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']]
features_oe = features.copy()
encoder = OrdinalEncoder()
features_oe[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']] = pd.DataFrame(encoder.fit_transform(bad_series), columns=bad_series.columns)
train, test, target_train, target_test = train_test_split(
features_oe, target, test_size=0.25)
abc = AdaBoostClassifier(random_state=123)
model_params = {
"n_estimators" : [10, 20, 30, 40],
"learning_rate" : [0.2, 0.5, 0.7, 1]
}
grid_abc = GridSearchCV(abc, model_params)
grid_abc.fit(train, target_train)
print(grid_abc.best_params_)
pred_log = grid_abc.predict(test)
f_1_log = f1_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
precision = precision_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
recall = recall_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
accuracy_log = accuracy_score(target_test, pred_log)
print('Значение F1 = {}, \nзначение precision = {}, \nзначение recall = {}, \nзначение accuracy = {}'.format(f_1_log, precision, recall, accuracy_log))
{'learning_rate': 0.7, 'n_estimators': 20}
Значение F1 = 0.9788254095085897,
значение precision = 0.958528951486698,
значение recall = 1.0,
значение accuracy = 0.9585289514866979
features = data_mediana.drop("stroke", axis=1)
target = data_mediana["stroke"]
bad_series = features[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']]
features_oe = features.copy()
encoder = OrdinalEncoder()
features_oe[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']] = pd.DataFrame(encoder.fit_transform(bad_series), columns=bad_series.columns)
train, test, target_train, target_test = train_test_split(
features_oe, target, test_size=0.25)
abc = AdaBoostClassifier(random_state=123)
model_params = {
"n_estimators" : [10, 20, 30, 40],
"learning_rate" : [0.2, 0.5, 0.7, 1]
}
grid_abc = GridSearchCV(abc, model_params)
grid_abc.fit(train, target_train)
print(grid_abc.best_params_)
pred_log = grid_abc.predict(test)
f_1_log = f1_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
precision = precision_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
recall = recall_score(target_test, pred_log, average='weighted', labels=np.unique(pred_log))
accuracy_log = accuracy_score(target_test, pred_log)
print('Значение F1 = {}, \nзначение precision = {}, \nзначение recall = {}, \nзначение accuracy = {}'.format(f_1_log, precision, recall, accuracy_log))
{'learning_rate': 0.2, 'n_estimators': 10}
Значение F1 = 0.9743178170144463,
значение precision = 0.9499217527386542,
значение recall = 1.0,
значение accuracy = 0.9499217527386542
#Удаление срок с пустыми данными сократит датасет в два раза, поэтому возьмем все данные.
#Исправленые датасеты тоже не будет брать, потому что они значительно изменят достоверность статистики.
df = data.copy()
#Разделим датафрейм на два. В первом находятся данные только о мужчинах, во втором только о женщинах.
df_male = df.loc[df['gender'] == 'Male']
df_male = df_male.reset_index(drop=True)
df_female = df.loc[df['gender'] == 'Female']
df_female = df_female.reset_index(drop=True)
male_smoking_hypertension = dict()
for obj in df_male.iterrows():
if obj[1]['hypertension'] == '1.0':
row = obj[1]['smoking_status']
count = male_smoking_hypertension[row] if row in male_smoking_hypertension.keys() else 1
male_smoking_hypertension[row] = count+1
print(male_smoking_hypertension)
female_smoking_hypertension = dict()
for obj in df_female.iterrows():
if obj[1]['hypertension'] == '1.0':
row = obj[1]['smoking_status']
count = female_smoking_hypertension[row] if row in female_smoking_hypertension.keys() else 1
female_smoking_hypertension[row] = count+1
print(female_smoking_hypertension)
{'smokes': 47, 'formerly smoked': 59, 'never smoked': 85, 'Unknown': 25}
{'smokes': 46, 'never smoked': 136, 'Unknown': 26, 'formerly smoked': 57}
plt.title('Smoking status у мужчин с гипертонией')
plt.xlabel('Smoking status')
plt.ylabel('Количество мужчин', rotation=90)
names = list(male_smoking_hypertension.keys())
values = list(male_smoking_hypertension.values())
plt.bar(range(len(male_smoking_hypertension)), values, tick_label=names)
plt.show()
plt.title('Smoking status у женщин с гипертонией')
plt.xlabel('Smoking status')
plt.ylabel('Количество женщин', rotation=90)
names = list(female_smoking_hypertension.keys())
values = list(female_smoking_hypertension.values())
plt.bar(range(len(female_smoking_hypertension)), values, tick_label=names)
plt.show()
male_smoking_heart_disease = dict()
for obj in df_male.iterrows():
if obj[1]['heart_disease'] == '1.0':
row = obj[1]['smoking_status']
count = male_smoking_heart_disease[row] if row in male_smoking_heart_disease.keys() else 1
male_smoking_heart_disease[row] = count+1
print(male_smoking_heart_disease)
female_smoking_heart_diseasen = dict()
for obj in df_female.iterrows():
if obj[1]['heart_disease'] == '1.0':
row = obj[1]['smoking_status']
count = female_smoking_heart_diseasen[row] if row in female_smoking_heart_diseasen.keys() else 1
female_smoking_heart_diseasen[row] = count+1
print(female_smoking_heart_diseasen)
{'never smoked': 36, 'formerly smoked': 45, 'Unknown': 25, 'smokes': 31}
{'never smoked': 38, 'smokes': 22, 'formerly smoked': 24, 'Unknown': 17}
import plotly.express as px
names = list(male_smoking_heart_disease.keys())
values = list(male_smoking_heart_disease.values())
fig = px.pie(values=values, names=names, title='Smoking status у мужчин с болезнью сердца')
fig.update_traces(textinfo='percent+label')
fig.show()
names = list(female_smoking_heart_diseasen.keys())
values = list(female_smoking_heart_diseasen.values())
fig = px.pie(values=values, names=names, title='Smoking status у женщин с болезнью сердца')
fig.update_traces(textinfo='percent+label')
fig.show()
male_smoking_stroke = dict()
for obj in df_male.iterrows():
if obj[1]['stroke'] == '1':
row = obj[1]['smoking_status']
count = male_smoking_stroke[row] if row in male_smoking_stroke.keys() else 1
male_smoking_stroke[row] = count+1
print(male_smoking_stroke)
female_smoking_stroke = dict()
for obj in df_female.iterrows():
if obj[1]['stroke'] == '1':
row = obj[1]['smoking_status']
count = female_smoking_stroke[row] if row in female_smoking_stroke.keys() else 1
female_smoking_stroke[row] = count+1
print(female_smoking_stroke)
{'never smoked': 28, 'formerly smoked': 37, 'Unknown': 23, 'smokes': 24}
{'never smoked': 64, 'Unknown': 26, 'formerly smoked': 35, 'smokes': 20}
names1 = list(male_smoking_stroke.keys())
values1 = list(male_smoking_stroke.values())
names2 = names1
values2 = [female_smoking_stroke[names2[0]], female_smoking_stroke[names2[1]],
female_smoking_stroke[names2[2]], female_smoking_stroke[names2[3]]]
plt.figure(figsize=(10, 5))
plt.title('Smoking status у людей с перенесенным инсультом')
plt.xlabel('Smoking status')
plt.ylabel('Количество', rotation=90)
plt.plot(names1, values1, "-b", label="Male")
plt.plot(names2, values2, "-r", label="Female")
plt.legend(loc="upper left")
plt.show()
age_smoking = dict()
for obj in df_male.iterrows():
if obj[1]['smoking_status'] == 'smokes' and not pd.isna(obj[1]['age']):
row = str(obj[1]['gender']) + " " + str(obj[1]['age'])
count = age_smoking[row] if row in age_smoking.keys() else 1
age_smoking[row] = count+1
for obj in df_female.iterrows():
if obj[1]['smoking_status'] == 'smokes' and not pd.isna(obj[1]['age']):
row = str(obj[1]['gender']) + " " + str(obj[1]['age'])
count = age_smoking[row] if row in age_smoking.keys() else 1
age_smoking[row] = count+1
df_age_smoking = pd.DataFrame(columns = ['gender', 'age', 'count'])
for item in age_smoking.items():
gender, age = item[0].split(" ")
gender, age = str(gender), float(age)
row = pd.DataFrame([{'gender' : gender, 'age' : age, 'count' : item[1]}])
df_age_smoking = pd.concat([df_age_smoking, row], ignore_index=True)
import seaborn as sns
sns.set_theme(style="whitegrid")
cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
g = sns.relplot(
data=df_age_smoking,
x="age", y="gender", size="count",
palette=cmap, sizes=(20, 200), height=5, aspect=3,
)
g.ax.xaxis.grid(True, "minor", linewidth=.25)
g.ax.yaxis.grid(True, "minor", linewidth=.25)
g.despine(left=True, bottom=True)
<seaborn.axisgrid.FacetGrid at 0x1c743bf0eb0>